library(DT)
## Warning: package 'DT' was built under R version 4.0.5
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Read the OWID COVID-19 export and parse its `date` column as Date.
# The original assigned the parsed dates to a new `data` column (typo), which
# left `date` as character and made the range filter below a string comparison.
# NOTE(review): the path is absolute and machine-specific; confirm it before
# running this script elsewhere.
covid <- read.csv("/Users/davidvucic/Desktop/owid-covid-data-160222.csv")
covid$date <- as.Date(covid$date)

# Analysis window: start date is inclusive, end date is exclusive.
start_date <- "2021-02-01"
end_date <- "2021-08-01"

# Countries included in the lag analysis.
countries_lag <- c(
  "United States", "India", "Brazil", "France", "United Kingdom", "Russia",
  "Turkey", "Italy", "Germany", "Spain", "Argentina",
  "Colombia", "Poland", "Mexico", "Netherlands",
  "Indonesia", "Ukraine", "South Africa", "Philippines", "Peru", "Belgium",
  "Czechia", "Japan", "Israel"
)

# Keep only the columns needed for the lag analysis, then restrict the rows to
# the analysis window and the selected countries.
lag_covid <- covid %>%
  select(
    date, location,
    people_vaccinated_per_hundred, people_fully_vaccinated_per_hundred
  ) %>%
  filter(
    date >= as.Date(start_date) & date < as.Date(end_date),
    location %in% countries_lag
  )

# Replace missing vaccination percentages with 0. The per-country processing
# later treats 0 as "not reported" and carries the previous value forward.
lag_covid$people_vaccinated_per_hundred[
  is.na(lag_covid$people_vaccinated_per_hundred)
] <- 0
lag_covid$people_fully_vaccinated_per_hundred[
  is.na(lag_covid$people_fully_vaccinated_per_hundred)
] <- 0
# LOWESS smoother wrapper.
#
# data: values to smooth (passed straight to stats::lowess()).
# f:    smoother span forwarded to stats::lowess().
#
# Returns only the fitted `y` component of the lowess() result, so the function
# can be applied column-wise and yield a smoothed column of the same length.
Lowess <- function(data, f) {
  stats::lowess(data, f = f)[["y"]]
}
# Find the shift between two series that minimises their cosine distance.
#
# FirstDose, SecondDose: matrices (or data frames) with the same number of
#   rows; only their first column is used.
# windowsize: number of shifts to try in each direction. Shift k (k = 1 ..
#   windowsize) drops k - 1 rows, so the first shift compares the unshifted
#   series. Must not exceed the number of rows.
#
# Returns the index of the smallest distance in a vector of
# 2 * windowsize distances laid out as follows:
#   * positions 1 .. windowsize: FirstDose rows k..n compared against
#     SecondDose rows 1..(n - k + 1);
#   * positions windowsize + 1 .. 2 * windowsize: FirstDose rows
#     1..(n - k + 1) compared against SecondDose rows k..n.
# Returns integer(0) when no finite distance exists (e.g. windowsize is 0).
lagValue <- function(FirstDose, SecondDose, windowsize) {
  n_rows <- nrow(FirstDose)
  if (is.null(n_rows) || is.null(nrow(SecondDose))) {
    stop("FirstDose and SecondDose must be matrices or data frames",
         call. = FALSE)
  }
  if (n_rows != nrow(SecondDose)) {
    stop("FirstDose and SecondDose must have the same number of rows",
         call. = FALSE)
  }
  if (!is.numeric(windowsize) || length(windowsize) != 1 || is.na(windowsize)) {
    stop("windowsize must be a single number", call. = FALSE)
  }
  # A shift beyond the last row would make `shift:n_rows` count backwards and
  # index out of bounds, so reject it up front with a clear message.
  if (windowsize > n_rows) {
    stop("windowsize (", windowsize, ") must not exceed the number of rows (",
         n_rows, ")", call. = FALSE)
  }

  n_shifts <- max(0, floor(windowsize))

  # Cosine distance between the selected rows of the two series. t() turns each
  # selected column slice into a single-row matrix, so proxy::dist() yields
  # exactly one cross-distance.
  cosine_distance <- function(first_rows, second_rows) {
    as.numeric(proxy::dist(
      t(FirstDose[first_rows, 1]),
      t(SecondDose[second_rows, 1]),
      method = "cosine"
    ))
  }

  # Preallocated: first half holds the shifted-FirstDose distances, second
  # half the shifted-SecondDose distances.
  dist_vector <- numeric(2 * n_shifts)
  for (shift in seq_len(n_shifts)) {
    late_rows <- shift:n_rows
    early_rows <- seq_len(n_rows - shift + 1)
    dist_vector[shift] <- cosine_distance(late_rows, early_rows)
    dist_vector[n_shifts + shift] <- cosine_distance(early_rows, late_rows)
  }

  # Index of the minimum distance; callers decode it into a direction and lag.
  which.min(dist_vector)
}
# Replace every 0 in `x` with the most recent preceding non-zero value.
# Leading zeros (nothing to carry forward yet) are left as 0.
fill_zeros_forward <- function(x) {
  # For each position, the index of the latest non-zero entry seen so far
  # (0 when there has been none).
  last_nonzero <- cummax(seq_along(x) * (x != 0))
  has_previous <- last_nonzero > 0
  filled <- x
  filled[has_previous] <- x[last_nonzero[has_previous]]
  filled
}

# One raw lag index per country, in the order of `countries_lag`.
lag_vector <- integer(length(countries_lag))

for (z in seq_along(countries_lag)) {
  # Records for this country only
  lagCovid_filtered <- filter(lag_covid, location == countries_lag[z])
  if (nrow(lagCovid_filtered) == 0) {
    stop("No vaccination records found for country: ", countries_lag[z],
         call. = FALSE)
  }

  # Missing values were previously encoded as 0; replace each of them with the
  # value from the preceding row, for both the 1st and 2nd vaccine columns.
  combined_matrix <- cbind(
    fill_zeros_forward(lagCovid_filtered[["people_vaccinated_per_hundred"]]),
    fill_zeros_forward(
      lagCovid_filtered[["people_fully_vaccinated_per_hundred"]]
    )
  )

  # Smooth both columns. f = 0.15 is an arbitrary span.
  combined_matrix_smooth <- cbind(
    Lowess(combined_matrix[, 1], f = 0.15),
    Lowess(combined_matrix[, 2], f = 0.15)
  )

  # Keep each smoothed column as its own single-column matrix
  FirstDose_matrix <- combined_matrix_smooth[, 1, drop = FALSE]
  SecondDose_matrix <- combined_matrix_smooth[, 2, drop = FALSE]

  # Plot the smoothed 1st and 2nd vaccine percentages; the x axis is the row
  # index (one row per record in the filtered data).
  matplot(combined_matrix_smooth, type = "l", lty = 1,
          ylab = "Percentage of Population", xlab = countries_lag[z])
  legend("topleft", c("At Least 1 Vaccine", "2 Vaccines"), lty = 1, col = 1:2)

  # Raw lag index between the 1st and 2nd dose curves for this country
  lag_index <- lagValue(FirstDose_matrix, SecondDose_matrix, windowsize = 100)
  if (length(lag_index) != 1) {
    # lagValue() yields integer(0) when no finite distance exists; fail here
    # rather than silently producing a vector that no longer lines up with
    # `countries_lag`.
    stop("Could not determine a lag for country: ", countries_lag[z],
         call. = FALSE)
  }
  lag_vector[z] <- lag_index
}

# label the lag values with the corresponding country
names(lag_vector) <- countries_lag
# Decode a raw index returned by lagValue() into a lag direction and length.
#
# lag:        index into the distance vector built by lagValue().
# windowsize: the window size that was passed to lagValue().
#
# Indices up to `windowsize` belong to the first half of that vector (first
# dose lag); larger indices belong to the second half (second dose lag), so
# the length of the first half is subtracted off. In both halves the first
# entry corresponds to a lag of 0, hence the extra "- 1".
#
# Returns a named character vector c(LagType = ..., Lag = ...); the numeric lag
# is coerced to character because it shares a vector with the label.
lagType <- function(lag, windowsize) {
  in_second_half <- lag > windowsize
  if (in_second_half) {
    label <- "Second Dose Lag"
    offset <- windowsize
  } else {
    label <- "First Dose Lag"
    offset <- 0
  }
  c(LagType = label, Lag = lag - offset - 1)
}
# Decode every country's raw lag index. mapply() calls lagType() once per
# element of lag_vector and binds the results into a character matrix with one
# column per country and the rows "LagType" and "Lag".
lag_df <- mapply(lagType, lag = lag_vector, windowsize = 100)

# Interactive table of the decoded lags, one row per country
datatable(t(lag_df))

# Same table as a data frame: add the country names (taken from the row names)
# as a leading column and turn the lag back into a number for plotting.
lag_table <- as.data.frame(t(lag_df))
lag_table <- cbind(Country = rownames(lag_table), lag_table)
lag_table$Lag <- as.numeric(lag_table$Lag)

# Bar chart of the lag per country
fig <- plot_ly(data = lag_table, x = ~Country, y = ~Lag, type = 'bar') %>%
  layout(title = "Lag Between 1st and 2nd COVID-19 Vaccine")
fig